Week 4: Clustering and classification

date() 
## [1] "Mon Nov 27 12:41:05 2023"

1) Data wrangling exercises

R-Script: https://github.com/venkatharina/IODS-project/blob/Data/create_alc.R

2) Analysis exercises

library(MASS)
library(tidyr)
library(corrplot)
## corrplot 0.92 loaded
library(ggplot2)

#loading the Boston housing data from the MASS package
data("Boston")

#exploring the dataset: dimensions and structure
dim(Boston) #506 rows and 14 variables
## [1] 506  14
str(Boston) #numeric and integer vectors
## 'data.frame':    506 obs. of  14 variables:
##  $ crim   : num  0.00632 0.02731 0.02729 0.03237 0.06905 ...
##  $ zn     : num  18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
##  $ indus  : num  2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
##  $ chas   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nox    : num  0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
##  $ rm     : num  6.58 6.42 7.18 7 7.15 ...
##  $ age    : num  65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
##  $ dis    : num  4.09 4.97 4.97 6.06 6.06 ...
##  $ rad    : int  1 2 2 3 3 3 5 5 5 5 ...
##  $ tax    : num  296 242 242 222 222 222 311 311 311 311 ...
##  $ ptratio: num  15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
##  $ black  : num  397 397 393 395 397 ...
##  $ lstat  : num  4.98 9.14 4.03 2.94 5.33 ...
##  $ medv   : num  24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
#Data describes methodological problems associated with the use of housing market data to measure the willingness to pay for clean air
#With the use of a hedonic housing price model and data for the Boston metropolitan area, quantitative estimates of the willingness to pay for air quality improvements are generated
#Marginal air pollution damages (as revealed in the housing market) are found to increase with the level of air pollution and with household income

#graphical overview and summary of the data:
#NOTE(review): for a data frame, plot() dispatches to pairs(), so these two calls draw the same scatterplot matrix
plot(Boston)
pairs(Boston)

#numerical summaries (min, quartiles, mean, max) of each variable
summary(Boston)
##       crim                zn             indus            chas        
##  Min.   : 0.00632   Min.   :  0.00   Min.   : 0.46   Min.   :0.00000  
##  1st Qu.: 0.08205   1st Qu.:  0.00   1st Qu.: 5.19   1st Qu.:0.00000  
##  Median : 0.25651   Median :  0.00   Median : 9.69   Median :0.00000  
##  Mean   : 3.61352   Mean   : 11.36   Mean   :11.14   Mean   :0.06917  
##  3rd Qu.: 3.67708   3rd Qu.: 12.50   3rd Qu.:18.10   3rd Qu.:0.00000  
##  Max.   :88.97620   Max.   :100.00   Max.   :27.74   Max.   :1.00000  
##       nox               rm             age              dis        
##  Min.   :0.3850   Min.   :3.561   Min.   :  2.90   Min.   : 1.130  
##  1st Qu.:0.4490   1st Qu.:5.886   1st Qu.: 45.02   1st Qu.: 2.100  
##  Median :0.5380   Median :6.208   Median : 77.50   Median : 3.207  
##  Mean   :0.5547   Mean   :6.285   Mean   : 68.57   Mean   : 3.795  
##  3rd Qu.:0.6240   3rd Qu.:6.623   3rd Qu.: 94.08   3rd Qu.: 5.188  
##  Max.   :0.8710   Max.   :8.780   Max.   :100.00   Max.   :12.127  
##       rad              tax           ptratio          black       
##  Min.   : 1.000   Min.   :187.0   Min.   :12.60   Min.   :  0.32  
##  1st Qu.: 4.000   1st Qu.:279.0   1st Qu.:17.40   1st Qu.:375.38  
##  Median : 5.000   Median :330.0   Median :19.05   Median :391.44  
##  Mean   : 9.549   Mean   :408.2   Mean   :18.46   Mean   :356.67  
##  3rd Qu.:24.000   3rd Qu.:666.0   3rd Qu.:20.20   3rd Qu.:396.23  
##  Max.   :24.000   Max.   :711.0   Max.   :22.00   Max.   :396.90  
##      lstat            medv      
##  Min.   : 1.73   Min.   : 5.00  
##  1st Qu.: 6.95   1st Qu.:17.02  
##  Median :11.36   Median :21.20  
##  Mean   :12.65   Mean   :22.53  
##  3rd Qu.:16.95   3rd Qu.:25.00  
##  Max.   :37.97   Max.   :50.00
#correlation matrix of all variables, visualized as circles (size/color = strength/sign)
cor_matrix <- cor(Boston) 
corrplot(cor_matrix, method="circle")

#lots of variables; some appear uncorrelated, others correlate negatively or positively with each other

#scaling data: center each column to mean 0 and scale to unit variance
#(scale() returns a matrix, so convert back to a data frame)
boston_scaled <- as.data.frame(scale(Boston))
summary(boston_scaled)
##       crim                 zn               indus              chas        
##  Min.   :-0.419367   Min.   :-0.48724   Min.   :-1.5563   Min.   :-0.2723  
##  1st Qu.:-0.410563   1st Qu.:-0.48724   1st Qu.:-0.8668   1st Qu.:-0.2723  
##  Median :-0.390280   Median :-0.48724   Median :-0.2109   Median :-0.2723  
##  Mean   : 0.000000   Mean   : 0.00000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.007389   3rd Qu.: 0.04872   3rd Qu.: 1.0150   3rd Qu.:-0.2723  
##  Max.   : 9.924110   Max.   : 3.80047   Max.   : 2.4202   Max.   : 3.6648  
##       nox                rm               age               dis         
##  Min.   :-1.4644   Min.   :-3.8764   Min.   :-2.3331   Min.   :-1.2658  
##  1st Qu.:-0.9121   1st Qu.:-0.5681   1st Qu.:-0.8366   1st Qu.:-0.8049  
##  Median :-0.1441   Median :-0.1084   Median : 0.3171   Median :-0.2790  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.5981   3rd Qu.: 0.4823   3rd Qu.: 0.9059   3rd Qu.: 0.6617  
##  Max.   : 2.7296   Max.   : 3.5515   Max.   : 1.1164   Max.   : 3.9566  
##       rad               tax             ptratio            black        
##  Min.   :-0.9819   Min.   :-1.3127   Min.   :-2.7047   Min.   :-3.9033  
##  1st Qu.:-0.6373   1st Qu.:-0.7668   1st Qu.:-0.4876   1st Qu.: 0.2049  
##  Median :-0.5225   Median :-0.4642   Median : 0.2746   Median : 0.3808  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 1.6596   3rd Qu.: 1.5294   3rd Qu.: 0.8058   3rd Qu.: 0.4332  
##  Max.   : 1.6596   Max.   : 1.7964   Max.   : 1.6372   Max.   : 0.4406  
##      lstat              medv        
##  Min.   :-1.5296   Min.   :-1.9063  
##  1st Qu.:-0.7986   1st Qu.:-0.5989  
##  Median :-0.1811   Median :-0.1449  
##  Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.6024   3rd Qu.: 0.2683  
##  Max.   : 3.5453   Max.   : 2.9865
#after scaling, every variable is centered at mean 0 (unit variance)

#`crim` (per capita crime rate by town)
#cut the variable to get the high, low and middle rates of crime into their own categories
#(fix: scale() already returns numeric columns, so the redundant as.numeric() coercion was dropped)
#using the quantiles as the break points (bins)
bins <- quantile(boston_scaled$crim)
bins
##           0%          25%          50%          75%         100% 
## -0.419366929 -0.410563278 -0.390280295  0.007389247  9.924109610
#creating a categorical variable; include.lowest keeps the minimum value in the first bin
crime <- cut(boston_scaled$crim, breaks = bins, include.lowest = TRUE)
#removing original crim from the dataset
boston_scaled <- dplyr::select(boston_scaled, -crim)
#adding the new categorical value to scaled data
boston_scaled <- data.frame(boston_scaled, crime)
#creating testing and training data (random 80/20 split):
set.seed(123)  # fix the RNG so the train/test split is reproducible
n <- nrow(boston_scaled)
ind <- sample(n, size = n * 0.8)
train <- boston_scaled[ind, ]
test <- boston_scaled[-ind, ]
#linear discriminant analysis: crime category as target, all other variables as predictors
lda.fit <- lda(crime ~ ., data = train)
lda.fit
## Call:
## lda(crime ~ ., data = train)
## 
## Prior probabilities of groups:
## [-0.419,-0.411]  (-0.411,-0.39] (-0.39,0.00739]  (0.00739,9.92] 
##       0.2500000       0.2475248       0.2524752       0.2500000 
## 
## Group means:
##                         zn      indus        chas        nox         rm
## [-0.419,-0.411]  0.9013847 -0.9384597 -0.11640431 -0.8869545  0.4266499
## (-0.411,-0.39]  -0.1007028 -0.2738895  0.04263895 -0.5704814 -0.1364105
## (-0.39,0.00739] -0.3855121  0.1947801  0.19085920  0.3661826  0.1085060
## (0.00739,9.92]  -0.4872402  1.0171306 -0.03844192  0.9971081 -0.4429560
##                        age        dis        rad        tax     ptratio
## [-0.419,-0.411] -0.9090163  0.8747633 -0.6805407 -0.7321566 -0.43862212
## (-0.411,-0.39]  -0.3061545  0.3604235 -0.5466022 -0.4636199 -0.01779888
## (-0.39,0.00739]  0.4007615 -0.3908046 -0.3851187 -0.2923188 -0.25388687
## (0.00739,9.92]   0.7971886 -0.8652127  1.6379981  1.5139626  0.78062517
##                       black       lstat        medv
## [-0.419,-0.411]  0.37485899 -0.77627945  0.50670982
## (-0.411,-0.39]   0.31315966 -0.13288826 -0.01955032
## (-0.39,0.00739]  0.06171543  0.02056259  0.16400490
## (0.00739,9.92]  -0.83002806  0.90410140 -0.69642395
## 
## Coefficients of linear discriminants:
##                 LD1          LD2         LD3
## zn       0.08857954  0.640000887 -0.86897064
## indus    0.02781640 -0.388425807  0.31351953
## chas    -0.08078960 -0.017076849  0.15027800
## nox      0.41951459 -0.763339463 -1.37670365
## rm      -0.09178916 -0.090471137 -0.16863777
## age      0.24208158 -0.347410085  0.08065747
## dis     -0.07720764 -0.270819956  0.29525111
## rad      2.97396015  0.814847092 -0.11624894
## tax     -0.01961950  0.254358047  0.52832916
## ptratio  0.12862145 -0.081777381 -0.19097489
## black   -0.11925499 -0.001842029  0.10764266
## lstat    0.23550054 -0.221497287  0.32857265
## medv     0.20708297 -0.401281969 -0.20636557
## 
## Proportion of trace:
##    LD1    LD2    LD3 
## 0.9440 0.0427 0.0133
# Draw variable-loading arrows with labels on an existing LDA biplot.
# x: fitted lda object; myscale: arrow length multiplier; arrow_heads: arrowhead size;
# color/tex: arrow and label styling; choices: which two discriminants to use.
lda.arrows <- function(x, myscale = 1, arrow_heads = 0.1, color = "red", tex = 0.75, choices = c(1,2)){
  loadings <- coef(x)
  x_ends <- myscale * loadings[, choices[1]]
  y_ends <- myscale * loadings[, choices[2]]
  # one arrow per predictor, all starting from the origin
  graphics::arrows(x0 = 0, y0 = 0,
                   x1 = x_ends,
                   y1 = y_ends, col = color, length = arrow_heads)
  # label each arrow tip with the predictor name
  text(myscale * loadings[, choices], labels = row.names(loadings),
       cex = tex, col = color, pos = 3)
}
#numeric class labels, used to color/shape the points in the biplot
classes <- as.numeric(train$crime)
#plotting lda results (biplot)
#(fix: `classes` was computed but never used — pass it to col/pch so points show their crime class)
plot(lda.fit, dimen = 2, col = classes, pch = classes)
lda.arrows(lda.fit, myscale = 1)

#saving the true crime categories from the test set for later comparison
correct_classes <- test$crime
#then removing the categorical crime variable from the test dataset
test <- dplyr::select(test, -crime)
# predict classes with test data
lda.pred <- predict(lda.fit, newdata = test)
# cross tabulate the results (rows = true class, columns = predicted class)
table(correct = correct_classes, predicted = lda.pred$class)
##                  predicted
## correct           [-0.419,-0.411] (-0.411,-0.39] (-0.39,0.00739] (0.00739,9.92]
##   [-0.419,-0.411]              15              7               4              0
##   (-0.411,-0.39]                6             15               5              0
##   (-0.39,0.00739]               1              9              14              0
##   (0.00739,9.92]                0              0               0             26
#predictions are fully correct only in class 4 (0.00739,9.92]
#mostly predictions seem okay-ish (majority in right class), but it could be better

#reloading the data (to undo the earlier crim -> crime replacement)
data("Boston")
#scaling data so all variables contribute equally to the distances:
boston_scaled <- as.data.frame(scale(Boston))
summary(boston_scaled$dis)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -1.2658 -0.8049 -0.2790  0.0000  0.6617  3.9566
#calculating (euclidean) distances between the observations
distance <- dist(boston_scaled)
summary(distance)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.1343  3.4625  4.8241  4.9111  6.1863 14.3970
#trying to identify the right cluster number with the elbow method
set.seed(140)
k_max <- 10
#total within-cluster sum of squares for k = 1..k_max
#(fix: cluster the SCALED data here, consistent with the kmeans() call below — the original used unscaled Boston)
twcss <- sapply(1:k_max, function(k){kmeans(boston_scaled, k)$tot.withinss})
qplot(x = 1:k_max, y = twcss, geom = 'line')
## Warning: `qplot()` was deprecated in ggplot2 3.4.0.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

#choose k where the total WCSS drops most sharply; 6 clusters were used here
#(NOTE(review): with scaled data the sharpest drop is often at a smaller k — re-inspect the elbow plot)
km <- kmeans(boston_scaled, centers = 6)
# plot the dataset with observations colored by their cluster
pairs(boston_scaled, col = km$cluster)

#TODO: interpret the k-means results — compare how the clusters differ across the variables in the pairs plot

############ BONUS ############

#reloading data (fresh copy of Boston):
data("Boston")
#scaling data (mean 0, unit variance per variable):
boston_scaled <- as.data.frame(scale(Boston))
#k-means clustering with more than two clusters (bonus exercise)
set.seed(140)  # fix the RNG so the clustering is reproducible
km3 <- kmeans(boston_scaled, centers = 8)
#attach the cluster assignment as a column to use as the LDA target
boston_scaled$kmcluster <- as.numeric(km3$cluster)

#LDA with the k-means cluster as the grouping variable, all other variables as predictors
lda.fit2 <- lda(kmcluster ~ ., data = boston_scaled)
lda.fit2
## Call:
## lda(kmcluster ~ ., data = boston_scaled)
## 
## Prior probabilities of groups:
##          1          2          3          4          5          6          7 
## 0.21343874 0.04150198 0.21541502 0.06521739 0.10869565 0.15415020 0.10276680 
##          8 
## 0.09881423 
## 
## Group means:
##         crim           zn      indus        chas        nox          rm
## 1 -0.3768106 -0.425505051 -0.3701112  0.09221725 -0.2286148 -0.35000772
## 2  3.2082952 -0.487240187  1.0149946 -0.27232907  1.0998477 -1.45566793
## 3 -0.4060502 -0.005364112 -0.6583486 -0.27232907 -0.8405622 -0.04480167
## 4  1.1193656 -0.487240187  1.0149946 -0.27232907  0.9950574 -0.19450805
## 5 -0.4146327  2.524683754 -1.2040537 -0.20074543 -1.2034679  0.73835915
## 6  0.4620310 -0.487240187  1.0149946  0.13147608  1.0021383 -0.15800782
## 7 -0.2727474 -0.487240187  1.5613334  0.10623826  1.1631724 -0.63230595
## 8 -0.3681800 -0.053323566 -0.7442735  0.59383298 -0.2416605  1.68533550
##          age        dis        rad         tax     ptratio      black
## 1  0.3376923 -0.1286995 -0.5799077 -0.53244743  0.22283672  0.2729966
## 2  1.0064301 -1.0710604  1.6596029  1.52941294  0.80577843 -0.1691195
## 3 -1.0244892  0.8700429 -0.5720054 -0.70802741 -0.07989337  0.3722489
## 4  0.7406815 -0.8461740  1.6596029  1.52941294  0.80577843 -3.2850509
## 5 -1.4205771  1.6272606 -0.6832698 -0.53843478 -0.89403340  0.3540831
## 6  0.6920432 -0.7470446  1.6596029  1.52941294  0.80577843  0.1640227
## 7  0.9324774 -0.9083143 -0.6130366  0.03111252 -0.35520300 -0.1409865
## 8  0.1056916 -0.2903328 -0.4926242 -0.78414273 -1.08156698  0.3392477
##        lstat        medv
## 1  0.1641254 -0.25817616
## 2  2.0102765 -1.39479170
## 3 -0.5625097  0.09778119
## 4  1.1195132 -1.03254705
## 5 -0.9678176  0.83523455
## 6  0.3945960 -0.31539874
## 7  0.6799267 -0.51668841
## 8 -0.9695286  1.72241105
## 
## Coefficients of linear discriminants:
##                 LD1          LD2         LD3          LD4          LD5
## crim     0.19364384  0.126100236 -0.20191652 -0.376580103 -0.838531764
## zn      -0.05343615  1.094813039  1.50799254 -1.133131460  0.197678623
## indus    0.55399704 -1.280702654  1.13241037 -1.197258173  0.124603383
## chas    -0.03302284 -0.024772810 -0.14210875  0.080434989  0.122406528
## nox      0.05435103 -0.424948658  0.26397352 -0.771840326  0.285880375
## rm      -0.04813200  0.181941956  0.09971698  0.277384152  0.689446855
## age      0.14103637 -0.651476207 -0.09019746  0.008315478  0.502754473
## dis     -0.33219936  0.286780273  0.03476554 -0.281273725 -0.252480041
## rad      5.93891512  1.865585385 -1.41594980  0.744695993  0.427251550
## tax     -0.05791010  0.519204288  0.49438996 -0.580198607  0.087932891
## ptratio  0.21221433 -0.011384841  0.01954111  0.048612579 -0.116143664
## black   -0.19991944  0.269843607 -1.52326403 -1.543950260 -0.002263929
## lstat    0.07512642 -0.006494829  0.08239517  0.043407190 -0.209298121
## medv    -0.08095390  0.223086304 -0.03618324 -0.137450228  0.487212102
##                  LD6         LD7
## crim    -1.055793974  0.76410951
## zn      -0.472879553 -0.75684606
## indus    0.908040391  1.05671771
## chas    -0.139637868 -0.18984158
## nox      0.186010320  0.32881295
## rm      -0.077931268  0.31058189
## age     -0.573790467 -0.91403286
## dis      0.721069945  0.66003474
## rad      0.851480550  0.28819620
## tax     -0.245047274 -0.77133610
## ptratio -0.008931086 -0.50612180
## black    0.001693988 -0.02201946
## lstat   -0.741261471  0.23456713
## medv    -0.594939439  0.54523880
## 
## Proportion of trace:
##    LD1    LD2    LD3    LD4    LD5    LD6    LD7 
## 0.7632 0.1051 0.0506 0.0403 0.0205 0.0141 0.0062
############ SUPERBONUS ############

#predictors of the training set, without the target variable
model_predictors <- dplyr::select(train, -crime)
# check the dimensions
dim(model_predictors)
## [1] 404  13
dim(lda.fit$scaling)
## [1] 13  3
# matrix multiplication: project the training observations onto the linear discriminants
matrix_product <- as.matrix(model_predictors) %*% lda.fit$scaling
matrix_product <- as.data.frame(matrix_product)
#Next, install and access the plotly package. Create a 3D plot (cool!) of the columns of the matrix product using the code below.
#install.packages("plotly")
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:MASS':
## 
##     select
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
plot_ly(x = matrix_product$LD1, y = matrix_product$LD2, z = matrix_product$LD3, type= 'scatter3d', mode='markers')
#color the points by the crime classes of the training set
#(fix: pass the vector itself via `color =`; the original passed the literal string 'train$crime' to `colors =`, which does not color anything)
plot_ly(x = matrix_product$LD1, y = matrix_product$LD2, z = matrix_product$LD3, type= 'scatter3d', mode='markers', color = train$crime)
#p10 <- plot_ly() %>% 
#  add_trace(data = matrix_product,
#            x = matrix_product$LD1,
#            y = matrix_product$LD2,
#            z = matrix_product$LD3,
#            color = ~City,
#            colors = brewer.pal(length(names(table(train$crime))),
#                                "Paired"),
#            type= 'scatter3d',
#           )
#p10